Clean Data Using Pandas

Import the libraries we need


In [ ]:
import pandas as pd

Import data into Pandas


In [ ]:
# Use the fake dataset file we created
raw_data_file = 'data/customer_data_mini.csv'
raw_data = pd.read_csv(raw_data_file,
                        sep=',',
                        header=0,
                        index_col=False,
                        parse_dates=True,
                        tupleize_cols=False,
                        error_bad_lines=False,
                        warn_bad_lines=False,
                        skip_blank_lines=True
                        )
raw_data.head()

Rename column headers


In [ ]:
raw_data.columns

In [ ]:
raw_data.rename(columns={'birth date': 'date_of_birth',
                         'customer loyalty level': 'customer_loyalty_level',
                         'first name': 'first_name',
                         'last name': 'last_name',
                         'ssn': 'social_security_number',
                         'postcode': 'zipcode',
                         'job': 'position'}, inplace=True)
raw_data.columns

Fill in missing values


In [ ]:
# Replace all missing values a string - 'Missing'
raw_data.fillna('Missing', inplace=True)
raw_data

In [ ]:
# Replace all missing values with a 0
raw_data.fillna(0, inplace=True)

In [ ]:
# Common practice - replace all missing values with the mean of the dataframe
raw_data.fillna(raw_data.mean(), inplace=True)

In [ ]:
# Common practice - replace the missing values of a single colum with the mean of that column
# DO NOT RUN THIS if you ran the previous cell
raw_data['marketing_score'].fillna(raw_data.mean()['marketing_score'], inplace=True)
raw_data

Remove punctuation


In [ ]:
import string

exclude = set(string.punctuation)

def remove_punctuation(x):
    """
    Helper function to remove punctuation from a string
    x: any string
    """
    try:
        x = ''.join(ch for ch in x if ch not in exclude)
    except:
        pass
    return x

raw_data.last_name = raw_data.last_name.apply(remove_punctuation)
raw_data

Remove whitespace


In [ ]:
def remove_whitespace(x):
    """
    Helper function to remove any blank space from a string
    x: any string
    """
    try:
        x = "".join(x.split())
    except:
        pass
    return x

raw_data.last_name = raw_data.last_name.apply(remove_whitespace)
raw_data

Remove any string from within a string


In [ ]:
def remove_internal_abbreviations(s, thing_to_replace, replacement_string):
    """
    Helper function to remove things such as 'F/K/A' from a string
    s: the string to replace a value in
    thing_to_replace: what you want to replace in the given string
    replacement_string: the string to use as a replacement
    """
    try:
        s = s.replace(thing_to_replace, replacement_string)
    except:
        pass
    return s

raw_data['last_name'] = raw_data.apply(lambda x: remove_internal_abbreviations(x['last_name'], "FKA", "-"), axis=1)
raw_data

In [ ]: